workingdir="/Users/elsa/Desktop/THESIS/sled_dog_metagenomics"
setwd(workingdir)
load("resources/data/data.Rdata")
read_counts_row <- column_to_rownames(read_counts, "genome")
nsamples <- ncol(read_counts_row) # define number of samples, by reading from the counts table
metagenomic_bases <- sum(sample_metadata$metagenomic_bases)
host_bases <- sum(sample_metadata$host_bases)
discarded_bases <- sum(round(((sample_metadata$metagenomic_bases+sample_metadata$host_bases)/(1-sample_metadata$bases_lost_fastp_percent))-(sample_metadata$metagenomic_bases+sample_metadata$host_bases))) #define amount of discarded bases from quality filtering
total_bases <- discarded_bases + host_bases + metagenomic_bases
singlem_bases <- sum(sample_metadata$metagenomic_bases * sample_metadata$singlem_fraction)
nmags <- nrow(read_counts_row) # establish number of MAGs for analysis
sequencing_depth <- colSums(read_counts_row)
sequencing_depth_sum <- sum(sequencing_depth)
sequencing_depth_mean <- mean(sequencing_depth)
sequencing_depth_sd <- sd(sequencing_depth)
Number of samples in total
[1] 58
Number of MAGs The number of metagenome-assembled genomes (MAG) or draft bacterial genomes reconstructed from the metagenomic data.
555
Amount of total data (GB): The amount of total DNA data sequenced in gigabases (GB, one billion nucleotide bases).
333.71
Amount of discarded data (GB): The amount of data discarded due to low quality or lack of informativeness during data preprocesing. Discarding 5-15% of the produced data is within the expected range, due to formation of adaptor dimers, inclusion of adaptors in sequencing reads due to short insert sizes, low sequencing quality, etc.
10.36577
Amount of discarded data (in % of the raw data):
3.11
Amount of host data (GB): The amount of data mapped against the host genome. The percentage refers to the amount of data mapped to the host genome respect to quality-filtered data. Note that this value can be very variable depending on the biological features of the sample (e.g., anal swabs contain more host DNA than faeces) and the employed reference genome (e.g., the chances for mapping to the genome are lower as the distance between) the study species and the employed reference genome differ).
6.49
Amount of host data (% of the quality-filtered data):
2.01
Estimated prokaryotic data: The amount and proportion of data belonging to prokayotic genomes respect to the total metagenomic fraction, as estimated from singleM analysis. Note that this is an estimation that relies on the genome sizes of genomes available in reference databases. If a given taxon is not properly represented, genome size estimations can be less accurate.
293.14
Estimated prokaryotic data (% of the metagenomic data):
92.51
Amount of metagenomic data (GB): The amount of data mapped against the host genome. The percentage refers to the amount of data mapped to the host genome respect to quality-filtered data. Note that this value can be very variable depending on the biological features of the sample (e.g., anal swabs contain more host DNA than faeces) and the employed reference genome (e.g., the chances for mapping to the genome are lower as the distance between) the study species and the employed reference genome differ).
316.85
Amount of metagenomic data (% of the quality-filtered data):
97.99
Total mapped sequencing depth (million reads): The amount of reads (and nucleotide bases) that were mapped to the entire MAG catalogue. Note that the amount of bases is only an approximation estimated by multiplying the exact number of mapped reads by 250 bp.
1860.26
Total mapped sequencing depth (GB):
266.02
Average mapped sequencing depth (million reads): This is the average number of reads (and nucleotide bases) mapped to each sample. Note that the amount of bases is only an approximation estimated by multiplying the exact number of mapped reads by 250 bp.
32.07
Average mapped sequencing depth (GB):
4.59
Number of MAGs without species-level annotation
244
Percentage of MAGs without species-level annotation
43.96396
Number of phyla
[1] 12
Number of genera
[1] 135
heatmap <- ehi_phylum_colors %>%
dplyr::right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
arrange(match(genome, tree$tip.label)) %>%
dplyr::select(genome,phylum) %>%
mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
column_to_rownames(var = "genome")
circular_tree <- force.ultrametric(tree,method="extend") %>%
ggtree(., layout = 'circular', size = 0.3)
# Phylum ring
circular_tree <- gheatmap(circular_tree, heatmap, offset=0.65, width=0.1, colnames=FALSE) +
scale_fill_manual(values=colors_alphabetic) +
geom_tiplab2(size=1, hjust=-0.1) +
theme(plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))
# Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()
# Add completeness ring
circular_tree <- circular_tree +
new_scale_fill() +
scale_fill_gradient(low = "#d1f4ba", high = "#f4baba") +
geom_fruit(
data=genome_metadata,
geom=geom_bar,
mapping = aes(x=completeness, y=genome, fill=contamination),
offset = 0.55,
orientation="y",
stat="identity")
# Add genome-size ring
circular_tree <- circular_tree +
new_scale_fill() +
scale_fill_manual(values = "#cccccc") +
geom_fruit(
data=genome_metadata,
geom=geom_bar,
mapping = aes(x=mag_size, y=genome),
offset = 0.05,
orientation="y",
stat="identity")
#Plot circular tree
circular_tree
Overview of the taxonomy and genome characteristics of the MAGs.
Completeness: completeness of the MAG according to
CheckM assessment.
Contamination: contamination or redundancy of the MAG
according to CheckM assessment.
Size: size of the MAG in megabases (MB, one million
nucleotide bases).
genome_counts %>%
mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS nornalisation
pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
dplyr::left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
dplyr::left_join(., sample_metadata, by = join_by(sample == sample)) %>% #append sample metadata
ggplot(., aes(x=sample,y=count, fill=phylum, group=phylum)) + #grouping enables keeping the same sorting of taxonomic units
geom_bar(stat="identity", colour="white", linewidth=0.1) + #plot stacked bars with white borders
scale_fill_manual(values=phylum_colors) +
labs(y = "Relative abundance") +
facet_grid(.~region, scales="free_x")+
# facet_nested(.~Individual+Sample_type, scales="free_x") +
guides(fill = guide_legend(ncol = 1)) +
theme(axis.text.x=element_blank(),
axis.title.x = element_blank(),
axis.ticks.x=element_blank(),
panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black")) +
labs(fill="Phylum")
Diversity estimations for each sample.
Richness: Number of MAGs per sample (after applying
coverage filter).
Neutral diversity: Hill number of q=1 (Shannon
diversity), a diversity metric that accounts for richness and eveness
(relative abundances) of the MAGs.
Phylogenetic diversity: Phylogenetic Hill number of
q=1, a diversity metric that accounts for richness and eveness (relative
abundances), as well as phylogenetic relationships among MAGs.
Functional diversity: Functional Hill number of q=1, a
diversity metric that accounts for richness and eveness (relative
abundances), as well as functional dissimilarities among MAGs.
#Calculate Hill numbers
richness <- genome_counts %>%
column_to_rownames(var="genome") %>%
dplyr::select(where(~!all(. == 0))) %>%
hilldiv(.,q=0) %>%
t() %>%
as.data.frame() %>%
dplyr::rename(richness=1) %>%
rownames_to_column(var="sample")
neutral <- genome_counts %>%
column_to_rownames(var="genome") %>%
dplyr::select(where(~!all(. == 0))) %>%
hilldiv(.,q=1) %>%
t() %>%
as.data.frame() %>%
dplyr::rename(neutral=1) %>%
rownames_to_column(var="sample")
phylogenetic <- genome_counts %>%
column_to_rownames(var="genome") %>%
dplyr::select(where(~!all(. == 0))) %>%
hilldiv(.,q=1,tree=tree) %>%
t() %>%
as.data.frame() %>%
dplyr::rename(phylogenetic=1) %>%
rownames_to_column(var="sample")
# Aggregate basal GIFT into elements
dist <- genome_gifts %>%
to.elements(., GIFT_db) %>%
traits2dist(., method="gower")
genome_counts_filt <- genome_counts[genome_counts$genome %in% rownames(genome_gifts),]
rownames(genome_counts_filt) <- NULL
functional <- genome_counts_filt %>%
column_to_rownames(var="genome") %>%
dplyr::select(where(~!all(. == 0))) %>%
hilldiv(.,q=1,dist=dist) %>%
t() %>%
as.data.frame() %>%
dplyr::rename(functional=1) %>%
rownames_to_column(var="sample") %>%
mutate(functional = if_else(is.nan(functional), 1, functional))
# Merge all metrics
alpha_div <- richness %>%
dplyr::full_join(neutral,by=join_by(sample==sample)) %>%
dplyr::full_join(phylogenetic,by=join_by(sample==sample)) %>%
dplyr::full_join(functional,by=join_by(sample==sample)) %>%
dplyr::left_join(., sample_metadata, by = join_by(sample == sample))
richness_mean <- alpha_div %>%
group_by(region) %>%
dplyr::summarise_at(.vars = names(.)[2], .funs = c("Richness mean" = "mean", "Richness sd" = "sd")) %>%
dplyr::rename("Region"="region")
neutral_mean <- alpha_div %>%
group_by(region) %>%
dplyr::summarise_at(.vars = names(.)[3], .funs = c("Neutral mean" = "mean", "Neutral sd" = "sd"))
phylogenetic_mean <- alpha_div %>%
group_by(region) %>%
dplyr::summarise_at(.vars = names(.)[4], .funs = c("Phylogenetic mean" = "mean", "Phylogenetic sd" = "sd"))
functional_mean <- alpha_div %>%
group_by(region) %>%
dplyr::summarise_at(.vars = names(.)[5], .funs = c("Functional mean" = "mean", "Functional sd" = "sd"))
cbind(richness_mean, neutral_mean[, 2:3], phylogenetic_mean[, 2:3],
functional_mean[, 2:3]) %>%
as.data.frame()%>%
t() %>%
row_to_names(row_number = 1) %>%
as.data.frame() %>%
mutate_if(is.character, as.numeric) %>%
knitr::kable(.,digits = c(3,3))
| Daneborg | Ittoqqortoormiit | |
|---|---|---|
| Richness mean | 233.103 | 232.138 |
| Richness sd | 48.115 | 44.915 |
| Neutral mean | 102.846 | 96.607 |
| Neutral sd | 20.952 | 37.606 |
| Phylogenetic mean | 5.129 | 5.936 |
| Phylogenetic sd | 0.812 | 1.382 |
| Functional mean | 1.423 | 1.453 |
| Functional sd | 0.034 | 0.058 |
alpha_colors <- c("#e5bd5b", "#6b7398")
group_n <- alpha_div %>%
dplyr::select(region) %>%
pull() %>%
unique() %>%
length()
# all alpha diversity plots follow the same structure, thus only the script for neutral alpha diversity is shown.
plot1 <- alpha_div %>%
ggplot(aes(x = region, y = richness, group = region, color = region, fill = region)) +
geom_jitter(width = 0.05, size = 1.5, show.legend = FALSE) +
geom_violin(alpha = 0.2, width = 0.3, show.legend = FALSE) +
scale_color_manual(values = alpha_colors[c(1:group_n)]) +
scale_fill_manual(values = paste0(alpha_colors[c(1:group_n)], "50")) +
stat_compare_means(show.legend = FALSE) +
theme(axis.text.x = element_text(vjust = 0.5),
axis.title = element_text(size = 14, face = "bold"),
axis.text = element_text(face = "bold", size = 12),
axis.title.y = element_text(margin = margin(t = 0, r = 20, b = 0, l = 0)),
axis.title.x = element_text(margin = margin(t = 20, r = 0, b = 0, l = 0)),
panel.background = element_blank(),
# panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(size = 0.5, linetype = "solid", colour = "black")) +
guides(colour = guide_legend(override.aes = list(size = 6))) +
labs(x = "Region", y = "Richness")
beta_q0 <- genome_counts%>%
column_to_rownames(., "genome")%>%
hillpair(., q = 0)
beta_q1n <- genome_counts%>%
column_to_rownames(., "genome")%>%
hillpair(., q = 1)
beta_q1p <- genome_counts%>%
column_to_rownames(., "genome")%>%
hillpair(., q = 1, tree = tree)
beta_div_func <- genome_counts_filt%>%
column_to_rownames(., "genome")%>%
hillpair(.,q=1,dist=dist)
save(beta_q0,beta_q1n, beta_q1p, beta_div_func, file = "resources/data/beta_div.Rdata")
beta_metric <- beta_q1n$S
beta_q1n_nmds <- beta_metric %>%
vegan::metaMDS(., trymax = 500, k = 2, verbosity = FALSE) %>%
vegan::scores() %>%
as_tibble(., rownames = "sample") %>%
dplyr::left_join(sample_metadata, by = join_by(sample == sample))
group_n <- length(unique(beta_q1n_nmds$region))
beta_colors <- c("#e5bd5b", "#6b7398")
# all beta diversity plots follow the same format, thus only the script for neutral beta diversity is shown.
q1n_nmds <- beta_q1n_nmds %>%
group_by(region) %>%
mutate(x_cen = mean(NMDS1, na.rm = TRUE)) %>%
mutate(y_cen = mean(NMDS2, na.rm = TRUE)) %>%
ungroup()
ggplot(q1n_nmds, aes(x = NMDS1, y = NMDS2, color = region)) +
scale_color_manual(values = beta_colors[c(1:group_n)]) +
scale_shape_manual(values = 1:10) +
geom_point(size = 4) +
# stat_ellipse(aes(color = beta_q1n_nmds$Groups))+
geom_segment(aes(x = x_cen, y = y_cen, xend = NMDS1, yend = NMDS2), alpha = 0.9) +
theme_classic() +
theme(
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 20, face = "bold"),
axis.text = element_text(face = "bold", size = 18),
panel.background = element_blank(),
axis.line = element_line(size = 0.5, linetype = "solid", colour = "black"),
legend.text = element_text(size = 16),
legend.title = element_text(size = 18),
legend.position = "right", legend.box = "vertical"
)
Homogeneity of variance
Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999
Response: Distances
Df Sum Sq Mean Sq F N.Perm Pr(>F)
Groups 1 0.06203 0.062033 2.7774 999 0.104
Residuals 56 1.25076 0.022335
Pairwise comparisons:
(Observed p-value below diagonal, permuted p-value above diagonal)
Daneborg Ittoqqortoormiit
Daneborg 0.106
Ittoqqortoormiit 0.10119
Permanova
| Df | SumOfSqs | R2 | F | Pr(>F) | |
|---|---|---|---|---|---|
| region | 1 | 2.376 | 0.288 | 22.631 | 0.001 |
| sex | 2 | 0.200 | 0.024 | 0.951 | 0.440 |
| Residual | 54 | 5.670 | 0.688 | NA | NA |
| Total | 57 | 8.245 | 1.000 | NA | NA |
Homogeneity of variance
Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999
Response: Distances
Df Sum Sq Mean Sq F N.Perm Pr(>F)
Groups 1 0.02909 0.029089 2.7299 999 0.103
Residuals 56 0.59672 0.010656
Pairwise comparisons:
(Observed p-value below diagonal, permuted p-value above diagonal)
Daneborg Ittoqqortoormiit
Daneborg 0.104
Ittoqqortoormiit 0.10408
Permanova
| Df | SumOfSqs | R2 | F | Pr(>F) | |
|---|---|---|---|---|---|
| region | 1 | 0.139 | 0.102 | 6.326 | 0.001 |
| sex | 2 | 0.042 | 0.031 | 0.964 | 0.403 |
| Residual | 54 | 1.184 | 0.867 | NA | NA |
| Total | 57 | 1.365 | 1.000 | NA | NA |
Homogeneity of variance
Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999
Response: Distances
Df Sum Sq Mean Sq F N.Perm Pr(>F)
Groups 1 0.001164 0.0011635 0.2775 999 0.627
Residuals 56 0.234777 0.0041924
Pairwise comparisons:
(Observed p-value below diagonal, permuted p-value above diagonal)
Daneborg Ittoqqortoormiit
Daneborg 0.632
Ittoqqortoormiit 0.60041
Permanova
| Df | SumOfSqs | R2 | F | Pr(>F) | |
|---|---|---|---|---|---|
| region | 1 | 0.016 | 0.043 | 2.521 | 0.186 |
| Residual | 56 | 0.363 | 0.957 | NA | NA |
| Total | 57 | 0.379 | 1.000 | NA | NA |
#Aggregate bundle-level GIFTs into the compound level
GIFTs_elements <- to.elements(genome_gifts,GIFT_db)
GIFTs_elements_filtered <- GIFTs_elements[rownames(GIFTs_elements) %in% genome_counts_filt$genome,]
GIFTs_elements_filtered <- as.data.frame(GIFTs_elements_filtered) %>%
select_if(~ !is.numeric(.) || sum(.) != 0)
#Aggregate element-level GIFTs into the function level
GIFTs_functions <- to.functions(GIFTs_elements_filtered,GIFT_db)
#Aggregate function-level GIFTs into overall Biosynthesis, Degradation and Structural GIFTs
GIFTs_domains <- to.domains(GIFTs_functions,GIFT_db)
#Get community-weighed average GIFTs per sample
genome_counts_row <- genome_counts %>%
mutate_at(vars(-genome),~./sum(.)) %>%
column_to_rownames(., "genome")
GIFTs_elements_community <- to.community(GIFTs_elements_filtered,genome_counts_row,GIFT_db)
GIFTs_functions_community <- to.community(GIFTs_functions,genome_counts_row,GIFT_db)
GIFTs_domains_community <- to.community(GIFTs_domains,genome_counts_row,GIFT_db)
element_gift <- GIFTs_elements_community %>%
as.data.frame() %>%
rownames_to_column(., "sample") %>%
merge(., sample_metadata[c(1,4)], by="sample")
uniqueGIFT_db<- unique(GIFT_db[c(2,4,5,6)]) %>% unite("Function",Function:Element, sep= "_", remove=FALSE)
significant_elements <- element_gift %>%
pivot_longer(-c(sample,region), names_to = "trait", values_to = "value") %>%
group_by(trait) %>%
summarise(p_value = wilcox.test(value ~ region, exact = FALSE)$p.value) %>%
mutate(p_adjust=p.adjust(p_value, method="BH")) %>%
filter(p_adjust < 0.05)%>%
rownames_to_column(., "Elements") %>%
dplyr::left_join(.,uniqueGIFT_db[c(1,3)],by = join_by(trait == Code_element))
element_gift_t <- element_gift %>%
t() %>%
row_to_names(row_number = 1) %>%
as.data.frame() %>%
mutate_if(is.character, as.numeric) %>%
rownames_to_column(., "trait")
element_gift_filt <- subset(element_gift_t, trait %in% significant_elements$trait) %>%
t() %>%
row_to_names(row_number = 1) %>%
as.data.frame() %>%
mutate_if(is.character, as.numeric) %>%
rownames_to_column(., "sample")%>%
dplyr::left_join(., sample_metadata[c(1,4)], by = join_by(sample == sample))
element_gift_filt_means <- element_gift_filt %>%
dplyr::select(-sample)%>%
group_by(region) %>%
summarise(across(everything(), mean))%>%
t() %>%
row_to_names(row_number = 1) %>%
as.data.frame() %>%
mutate_if(is.character, as.numeric) %>%
rownames_to_column(., "Elements") %>%
dplyr::left_join(.,uniqueGIFT_db[c(1,3)],by = join_by(Elements == Code_element))
element_gift_filt_sd <- element_gift_filt %>%
dplyr::select(-sample)%>%
group_by(region) %>%
summarise(across(everything(), sd))%>%
t() %>%
row_to_names(row_number = 1) %>%
as.data.frame() %>%
mutate_if(is.character, as.numeric) %>%
rownames_to_column(., "Elements") %>%
dplyr::left_join(.,uniqueGIFT_db[c(1,3)],by = join_by(Elements == Code_element))
only three plots are shown to illustrate the plotting function.
| Code_function | Daneborg | Ittoqqortoormiit | Function |
|---|---|---|---|
| B02 | 0.367 | 0.338 | Amino acid biosynthesis |
| B03 | 0.099 | 0.094 | Amino acid derivative biosynthesis |
| B04 | 0.479 | 0.506 | SCFA biosynthesis |
| B09 | 0.005 | 0.003 | Metallophore biosynthesis |
| D01 | 0.044 | 0.056 | Lipid degradation |
| D02 | 0.204 | 0.223 | Polysaccharide degradation |
| D03 | 0.210 | 0.179 | Sugar degradation |
| D06 | 0.126 | 0.106 | Nitrogen compound degradation |
| D07 | 0.288 | 0.332 | Alcohol degradation |
| D09 | 0.142 | 0.163 | Antibiotic degradation |
| S01 | 0.412 | 0.404 | Cellular structure |
| S03 | 0.223 | 0.272 | Spore |
only 3 plots are shown to illustrate the plotting function.
No differences
element
function
# remove MAGs only present in one region
Daneborg_samples <- sample_metadata %>%
filter(region == "Daneborg") %>%
dplyr::select(sample) %>% pull()
Ittoqqortoormiit_samples <- sample_metadata %>%
filter(region == "Ittoqqortoormiit") %>%
dplyr::select(sample) %>% pull()
structural_zeros <- genome_counts %>%
dplyr::rowwise() %>% #compute for each row (genome)
mutate(all_zeros_Ittoqqortoormiit = all(c_across(all_of(Ittoqqortoormiit_samples)) == 0)) %>% # set true if all samples in Ittoqqortoormiit have zeros
mutate(all_zeros_Daneborg = all(c_across(all_of(Daneborg_samples)) == 0)) %>% # set true if all samples in Daneborg have zeros
mutate(average_Ittoqqortoormiit = mean(c_across(all_of(Ittoqqortoormiit_samples)), na.rm = TRUE)) %>% # get average genome counts across Ittoqqortoormiit
mutate(average_Daneborg = mean(c_across(all_of(Daneborg_samples)), na.rm = TRUE)) %>% # get average genome counts across Daneborg
filter(all_zeros_Ittoqqortoormiit == TRUE || all_zeros_Daneborg==TRUE) %>% # filter only genomes with structural zeros
mutate(present = case_when(
all_zeros_Ittoqqortoormiit & !all_zeros_Daneborg ~ "Daneborg",
!all_zeros_Ittoqqortoormiit & all_zeros_Daneborg ~ "Ittoqqortoormiit",
!all_zeros_Ittoqqortoormiit & !all_zeros_Daneborg ~ "None",
TRUE ~ NA_character_
)) %>%
mutate(average = ifelse(present == "Ittoqqortoormiit", average_Ittoqqortoormiit, average_Daneborg)) %>%
dplyr::select(genome, present, average) %>%
dplyr::left_join(genome_metadata, by=join_by(genome==genome)) %>%
arrange(present,-average)
Create phyloseq object (without structural
zeros)
#phyloseq object without structural zeros
# OTU table
phylo_counts <- genome_counts %>% # genome size normalized counts
dplyr::filter(!genome %in% structural_zeros$genome) %>% # remove structural zeros
column_to_rownames("genome") %>%
mutate_all(~ replace(., . == 0, 0.00001)) %>%
otu_table(., taxa_are_rows = TRUE)
phylo_samples <- sample_metadata %>%
column_to_rownames("sample") %>%
sample_data() #convert to phyloseq sample_data object
# Tax table
phylo_taxonomy <- genome_metadata %>%
filter(genome %in% rownames(phylo_counts)) %>% # remove structural zeros
mutate(genome2=genome) %>% #create a pseudo genome name column
column_to_rownames("genome2") %>%
dplyr::select(domain,phylum,class,order,family,genus,species,genome) %>% #add an additional taxonomic level to ensure genome-level analysis (as no all genomes have species-level taxonomic assignments. Otherwise, ANCOMBC2 aggregates analyses per species)
as.matrix() %>%
tax_table() #convert to phyloseq tax_table object
# Tree
tree <- phytools::force.ultrametric(tree, method = "extend")
## ***************************************************************
## * Note: *
## * force.ultrametric does not include a formal method to *
## * ultrametricize a tree & should only be used to coerce *
## * a phylogeny that fails is.ultrametric due to rounding -- *
## * not as a substitute for formal rate-smoothing methods. *
## ***************************************************************
phylo_tree = phyloseq::phy_tree(tree)
#Generate phyloseq object required to input ANCOMBC
genome_data <- phyloseq(phylo_counts, phylo_taxonomy, phylo_samples, phylo_tree)
#clr transformation
physeq_clr <- microbiome::transform(genome_data, 'clr')
count_crl <- data.frame(physeq_clr@otu_table)
count_crl_t <- data.frame(t(count_crl))
metadata_crl <- data.frame(physeq_clr@sam_data)
metadata_crl <- rownames_to_column(metadata_crl, "sample")
metadata_crl <- metadata_crl[match(rownames(count_crl_t),metadata_crl$sample),]
design <- metadata_crl[, c("sample", "region")]
Daneborg phylum summary
Phylum Mean SD
p__Fusobacteriota 40.23608681 8.74279095
p__Bacteroidota 33.98263660 10.13627575
p__Bacillota_A 11.41121322 5.63703542
p__Pseudomonadota 9.97027845 3.43239492
p__Bacillota 1.78251891 1.07519999
p__Bacillota_C 1.69036077 0.77215177
p__Deferribacterota 0.45273719 0.48325860
p__Campylobacterota 0.31755732 0.58481784
p__Actinomycetota 0.07535572 0.10221945
p__Bacillota_B 0.05231129 0.10290571
p__Spirochaetota 0.02894372 0.06589219
Ittoqqortoormiit phylum summary
Phylum Mean SD
p__Fusobacteriota 37.9104994 18.5204556
p__Bacteroidota 22.6181011 12.0215400
p__Bacillota_A 19.5577191 9.6774519
p__Pseudomonadota 11.6158875 8.8900704
p__Bacillota 4.1347908 5.2284457
p__Bacillota_C 1.7522190 0.7999087
p__Campylobacterota 1.2864882 1.7873106
p__Deferribacterota 0.4369387 0.8013901
p__Bacillota_B 0.3030163 0.6242852
p__Actinomycetota 0.2918390 0.3103995
p__Spirochaetota 0.0925010 0.2807852
Genome level
both regional analyses are done in the same manner, thus only
Ittoqqortoormiit is shown.
Ittoqqortoormiit
physeq_ittoq <- subset_samples(physeq_clr, region == "Ittoqqortoormiit")
physeq_ittoq <- prune_taxa(taxa_sums(physeq_ittoq)>0, physeq_ittoq)
table.rel1_ittoq <- physeq_ittoq@otu_table
means.table.rel1_ittoq <- as.data.frame(rowMeans(table.rel1_ittoq))
sd.table.rel1_ittoq <- as.data.frame(matrixStats::rowSds(table.rel1_ittoq, useNames = TRUE))
summary.ittoq <- merge(means.table.rel1_ittoq, sd.table.rel1_ittoq, by="row.names")
colnames(summary.ittoq) <- c("Genome","Mean", "SD")
head(summary.ittoq[order(-summary.ittoq$Mean),], row.names = FALSE)
Genome Mean SD
153 EHA01921_bin.7 8.339900 1.024674
143 EHA01918_bin.24 8.219120 1.310063
17 EHA01876_bin.7 8.198880 1.187592
106 EHA01907_bin.13 8.060607 1.352825
96 EHA01904_bin.12 7.949196 1.252277
164 EHA01926_bin.12 7.943538 1.364449
Daneborg
Genome Mean SD
25 EHA01876_bin.8 9.135991 2.669166
1 EHA01871_bin.13 8.508479 2.540384
33 EHA01878_bin.29 8.305711 2.520488
59 EHA01886_bin.14 8.008138 1.588673
94 EHA01893_bin.29 7.954409 1.188865
24 EHA01876_bin.7 7.860446 2.618191
Genome
clr_t <- as.data.frame(t(as.matrix(physeq_clr@otu_table))) %>%
rownames_to_column(., "sample")%>%
dplyr::left_join(., sample_metadata[c(1,4)], by = join_by(sample == sample))
significant_genome <- clr_t %>%
pivot_longer(-c(sample,region), names_to = "Genome", values_to = "value") %>%
group_by(Genome) %>%
summarise(p_value = wilcox.test(value ~ region, exact = FALSE)$p.value) %>%
mutate(p_adjust=p.adjust(p_value, method="BH")) %>%
filter(p_adjust < 0.05)
significant_genome
# A tibble: 145 × 3
Genome p_value p_adjust
<chr> <dbl> <dbl>
1 EHA01871_bin.13 1.65e- 5 0.000145
2 EHA01871_bin.23 2.42e- 4 0.00149
3 EHA01871_bin.4 7.42e- 8 0.00000354
4 EHA01872_bin.30 1.59e- 2 0.0472
5 EHA01872_bin.35 2.68e- 5 0.000222
6 EHA01872_bin.38 7.39e- 4 0.00378
7 EHA01872_bin.6 8.94e-10 0.000000379
8 EHA01873_bin.16 1.89e- 7 0.00000478
9 EHA01875_bin.14 9.40e- 3 0.0302
10 EHA01875_bin.22 1.23e- 2 0.0377
# ℹ 135 more rows
physeq_genome_filtered_rel <- microbiome::transform(genome_data, "compositional")
genome <- as.data.frame(t(as.matrix(physeq_genome_filtered_rel@otu_table)))
significant_genome_table <- genome[, colnames(genome) %in% significant_genome$Genome] %>%
rownames_to_column(., "sample")%>%
dplyr::left_join(., sample_metadata[c(1,4)], by = join_by(sample == sample))%>%
dplyr::select(-sample)
colNames <- names(significant_genome_table)[1:3] # change to change number of plotted MAGs
for(i in colNames){
plt <- ggplot(significant_genome_table, aes(x=region, y=.data[[i]], color = region)) +
geom_boxplot(alpha = 0.2, outlier.shape = NA, width = 0.3, show.legend = FALSE) +
geom_jitter(width = 0.1, show.legend = TRUE) +
theme_minimal() +
theme(
axis.line = element_line(size = 0.5, linetype = "solid", colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank())
print(plt)
}
only three plots are shown to illustrate the plotting function
Genera level
# A tibble: 26 × 3
Genus p_value p_adjust
<chr> <dbl> <dbl>
1 d__Bacteria_p__Deferribacterota_c__Deferribacteres_o__Defer… 2.55e-3 1.82e-2
2 d__Bacteria_p__Fusobacteriota_c__Fusobacteriia_o__Fusobacte… 6.20e-3 3.16e-2
3 g__Allobaculum 7.95e-9 4.25e-7
4 g__Anaerobiospirillum 7.48e-3 3.48e-2
5 g__Avimicrobium 1.43e-5 3.07e-4
6 g__Bacteroides 1.08e-2 4.80e-2
7 g__Blautia_A 7.51e-6 2.01e-4
8 g__CAJMNU01 3.92e-4 3.81e-3
9 g__CALUXS01 3.69e-4 3.81e-3
10 g__Campylobacter_D 5.64e-3 3.02e-2
# ℹ 16 more rows
only three plots shown to illustrate the plotting function.
Permutation test for rda under reduced model
Permutation: free
Number of permutations: 999
Model: rda(formula = count_crl_t ~ region, data = design)
Df Variance F Pr(>F)
Model 1 1420.6 9.205 0.001 ***
Residual 56 8642.3
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Permutation test for rda under reduced model
Permutation: free
Number of permutations: 999
Model: rda(formula = hel1 ~ region, data = design)
Df Variance F Pr(>F)
Model 1 0.08720 15.614 0.001 ***
Residual 56 0.31275
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
$r.squared
[1] 0.1411698
$adj.r.squared
[1] 0.1258336
$r.squared
[1] 0.2180283
$adj.r.squared
[1] 0.2040645
Permutation test for rda under reduced model
Permutation: free
Number of permutations: 999
Model: rda(formula = count_crl_t ~ region, data = design)
Df Variance F Pr(>F)
Model 1 1420.6 9.205 0.001 ***
Residual 56 8642.3
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Permutation test for rda under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999
Model: rda(formula = count_crl_t ~ region, data = design)
Df Variance F Pr(>F)
region 1 1420.6 9.205 0.001 ***
Residual 56 8642.3
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Permutation test for rda under reduced model
Forward tests for axes
Permutation: free
Number of permutations: 999
Model: rda(formula = count_crl_t ~ region, data = design)
Df Variance F Pr(>F)
RDA1 1 1420.6 9.205 0.001 ***
Residual 56 8642.3
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
regionIttoqqortoormiit
Down 124
NotSig 178
Up 253
genome
genera
Genome
diagdds = phyloseq_to_deseq2(genome_data, ~ region)
diagdds <- estimateSizeFactors(diagdds, type="poscounts",locfunc=genefilter::shorth)
diagdds = DESeq(diagdds, test="Wald", fitType="parametric")
diagdds.ins.fis <- results(diagdds, alpha=0.01, contrast=c("region", "Ittoqqortoormiit", "Daneborg"))
sigtab_diagdds.ins.fis <- diagdds.ins.fis[which(diagdds.ins.fis$pvalue < 0.05), ]
sigtab_diagdds_with_tax <- cbind(as(sigtab_diagdds.ins.fis, "data.frame"), as(tax_table(genome_data)[row.names(sigtab_diagdds.ins.fis), ], "matrix"))
#sigtab_diagdds_with_tax[order(sigtab_diagdds_with_tax$baseMean, decreasing=T), ]
deseq2_group <- as.data.frame(sigtab_diagdds_with_tax)
theme_set(theme_bw())
scale_fill_discrete <- function(palname = "Set1", ...) {
scale_fill_brewer(palette = palname, ...)
}
x = tapply(sigtab_diagdds_with_tax$log2FoldChange, sigtab_diagdds_with_tax$order, function(x) max(x))
x = sort(x, TRUE)
sigtab_diagdds_with_tax$Family = factor(as.character(sigtab_diagdds_with_tax$order), levels=names(x))
x = tapply(sigtab_diagdds_with_tax$log2FoldChange, sigtab_diagdds_with_tax$genus, function(x) max(x))
x = sort(x, TRUE)
sigtab_diagdds_with_tax$Genus = factor(as.character(sigtab_diagdds_with_tax$genus), levels=names(x))
Genera
## 7.6 ANCOM-BC
phylo_counts_ancom <- genome_counts %>%
filter(!genome %in% structural_zeros$genome) %>% # remove structural zeros
column_to_rownames("genome") %>%
mutate_all(~ replace(., . == 0, 0.00001)) %>% #add pseudo counts to avoid structural zero issues
otu_table(., taxa_are_rows = TRUE)
#Generate phyloseq object required to input ANCOMBC, only difference is the addition of the psudocounts
physeq_ancom <- phyloseq(phylo_counts_ancom, phylo_taxonomy, phylo_samples)
Genome
library(ANCOMBC)
set.seed(1234) #set seed for reproducibility
ancom_rand_output = ancombc2(data = physeq_ancom,
assay_name = NULL,
tax_level = NULL, #change to agglomerate analysis to a higher taxonomic range
fix_formula = "region", #fixed variable(s)
# rand_formula = "(1|Individual)",
p_adj_method = "holm",
pseudo_sens = TRUE,
prv_cut = 0.10,
s0_perc = 0.05,
group = NULL,
struc_zero = FALSE,
neg_lb = FALSE,
alpha = 0.05,
n_cl = 2,
verbose = TRUE,
global = FALSE,
pairwise = FALSE,
dunnet = FALSE,
trend = FALSE,
iter_control = list(tol = 1e-5, max_iter = 20, verbose = FALSE),
em_control = list(tol = 1e-5, max_iter = 100),
lme_control = NULL,
mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
trend_control = NULL)
tax <- data.frame(physeq_ancom@tax_table) %>%
rownames_to_column(., "taxon")
ancombc_rand_table <- ancom_rand_output$res %>%
dplyr::select(taxon, lfc_regionIttoqqortoormiit, p_regionIttoqqortoormiit) %>%
filter(p_regionIttoqqortoormiit < 0.05) %>%
dplyr::arrange(p_regionIttoqqortoormiit) %>%
merge(., tax, by="taxon")
colors_alphabetic <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
# mutate_at(vars(phylum), ~ str_replace(., "[dpcofgs]__", "")) %>%
dplyr::right_join(tax, by=join_by(phylum == phylum)) %>%
dplyr::select(phylum, colors) %>%
mutate(colors = str_c(colors, "80")) %>% #add 80% alpha
unique() %>%
dplyr::arrange(phylum)
tax_table <- as.data.frame(unique(ancombc_rand_table$phylum))
colnames(tax_table)[1] <- "phylum"
tax_color <- merge(tax_table, colors_alphabetic, by="phylum")%>%
dplyr::arrange(phylum) %>%
dplyr::select(colors) %>%
pull()
Genera
library(mia)
tse_LinDA = mia::makeTreeSummarizedExperimentFromPhyloseq(genome_data) # create tse object from the phyloseq object with structural zeros removed, no pseudo counts.
tse_LinDA <- mia::subsetByPrevalentTaxa(tse_LinDA, detection = 0, prevalence = 0.1) # filter tse object
Genome
otu.tab <- as.data.frame(assay(tse_LinDA))
meta <- as.data.frame(colData(tse_LinDA)) %>%
dplyr::select(region)
LinDA <- LinDA::linda(
otu.tab,
meta,
formula = '~region',
alpha = 0.01,
prev.cut = 0, # we already filtered
winsor.quan = 0.97)
# LinDA::linda.plot(LinDA, c('region'))
Genera
## 7.8 ALDEx2
library(ALDEx2)
genome_counts_row <- column_to_rownames(genome_counts, "genome")
genome_counts_row_new <- genome_counts_row[,order(colnames(genome_counts_row))]
#sample_metadata_test <- sample_metadata[order(sample_metadata$sample %in% colnames(genome_counts_row),)]
sample_metadata_new <- sample_metadata[ order(sample_metadata$sample),]
colnames(genome_counts_row_new) %in% sample_metadata_new$sample
dim(sample_metadata_new)[1]==dim(genome_counts_row_new)[2]
genome_counts_aldex <- genome_counts_row_new %>% rownames_to_column(., "genome") %>%
filter(!genome %in% structural_zeros$genome) %>% # remove structural zeros
column_to_rownames(var="genome") %>%
mutate_all(~ . * 1e6) %>% #multiple by a million
round(0) #round to integer
genome_counts_filtered.clr <- aldex.clr(genome_counts_aldex,
sample_metadata_new$region, #not a factor
mc.samples=128,
denom="all",
verbose=F)
genome_counts_filtered.ttest <- aldex.ttest(genome_counts_filtered.clr,
hist.plot=F,
paired.test=F,
verbose=F)
genome_counts_filtered.effect <- aldex.effect(genome_counts_filtered.clr,
CI=T,
verbose=F,
include.sample.summary=F,
glm.conds=NULL,
useMC=F)
genome_counts_filtered.all <- data.frame(genome_counts_filtered.ttest,genome_counts_filtered.effect) %>%
rownames_to_column(var="genome") %>%
dplyr::left_join(genome_metadata,by=join_by(genome==genome))
Significance based on posterior probabilities
Upset Analysis
library(UpSetR)
table_upset_DA <- DA_results %>%
dplyr::select(., genome, padj_WILCOX, padj_DESEQ, padj_ANCOMBC, padj_LINDA, padj_ALDEX) %>%
dplyr::rename(
Wilcoxon = padj_WILCOX,
DESeq2 = padj_DESEQ,
ANCOMBC = padj_ANCOMBC,
LinDA = padj_LINDA,
ALDEx = padj_ALDEX
) %>%
column_to_rownames(var="genome") %>%
{.[is.na(.)] <- 0; .}
table_upset_DA_binary <- ifelse(table_upset_DA > 0, 1, 0) %>% as.data.frame()
methodcolors=c('#c4d7d1','#408892','#c04062','#6b3a59','#e08683')
#pdf("figures/MAG_intersection.pdf",width=8,height=6, onefile=F)
upset(table_upset_DA_binary,
sets = rev(c("Wilcoxon","DESeq2","ANCOMBC","LinDA","ALDEx")),
sets.bar.color= rev(methodcolors),
mb.ratio = c(0.55, 0.45), order.by = "freq")
#dev.off()
# Filtered DA Results (only MAGs found to be DA by 2+ methods)
filtered_DA_results <- DA_results %>%
dplyr::select(., genome, padj_WILCOX, padj_DESEQ, padj_ANCOMBC, padj_LINDA, padj_ALDEX) %>%
column_to_rownames(var="genome") %>%
{.[is.na(.)] <- 0; .}
filtered_DA_results <- filtered_DA_results %>%
filter(rowSums(. > 0) >= 2)
# Merge annotations
filtered_DA_results <- filtered_DA_results %>%
rownames_to_column(var="genome")
filtered_DA_results <- subset(DA_results, genome %in% filtered_DA_results$genome)
filtered_DA_results <- filtered_DA_results %>%
mutate(enrichment = ifelse(LogFold_ANCOMBC > 0 | LogFold_LINDA > 0 | LogFold_DESEQ > 0, "Ittoqqortoormiit",
ifelse(LogFold_LINDA < 0 | LogFold_DESEQ < 0, "Daneborg", NA)))
#library("writexl")
#write_xlsx(filtered_DA_results, "/Users/elsa/Desktop/THESIS/sled_dog_metagenomics\\DA_merged.xlsx")
Mags from DA with species-level annotation
filtered_DA_results%>%
filter(species != "s__") # 109 DA MAGs WITH species-level annotation therefore 77 DA MAGs are w/o species level annotation
summary data
Mean for Cetacea_. :
# A tibble: 2 × 2
region mean_Cetacea_.
<chr> <dbl>
1 Daneborg 0.883
2 Ittoqqortoormii 0.897
Mean for Pinnipedia_. :
# A tibble: 2 × 2
region mean_Pinnipedia_.
<chr> <dbl>
1 Daneborg 0.497
2 Ittoqqortoormii 0.490
Mean for Eukaryota_. :
# A tibble: 2 × 2
region mean_Eukaryota_.
<chr> <dbl>
1 Daneborg 8.86
2 Ittoqqortoormii 8.17
Mean for Cryptosporidium_. :
# A tibble: 2 × 2
region mean_Cryptosporidium_.
<chr> <dbl>
1 Daneborg 0.00576
2 Ittoqqortoormii 0.00762
Mean for Giardia_. :
# A tibble: 2 × 2
region mean_Giardia_.
<chr> <dbl>
1 Daneborg 0.000362
2 Ittoqqortoormii 0.000506
Mean for Sarcocystis_. :
# A tibble: 2 × 2
region mean_Sarcocystis_.
<chr> <dbl>
1 Daneborg 0.00002
2 Ittoqqortoormii 0.000247
Mean for Toxoplasma_gondii_. :
# A tibble: 2 × 2
region mean_Toxoplasma_gondii_.
<chr> <dbl>
1 Daneborg 0.00445
2 Ittoqqortoormii 0.00534
Mean for Trichinella_. :
# A tibble: 2 × 2
region mean_Trichinella_.
<chr> <dbl>
1 Daneborg 0.0103
2 Ittoqqortoormii 0.00831
Mean for Uncinaria_. :
# A tibble: 2 × 2
region mean_Uncinaria_.
<chr> <dbl>
1 Daneborg 0.00122
2 Ittoqqortoormii 0.000072
Mean for Echinococcus_. :
# A tibble: 2 × 2
region mean_Echinococcus_.
<chr> <dbl>
1 Daneborg 0.0129
2 Ittoqqortoormii 0.0104
Mean for Taenia_. :
# A tibble: 2 × 2
region mean_Taenia_.
<chr> <dbl>
1 Daneborg 0.00106
2 Ittoqqortoormii 0.00170
Mean for Toxascaris_. :
# A tibble: 2 × 2
region mean_Toxascaris_.
<chr> <dbl>
1 Daneborg 0.0000233
2 Ittoqqortoormii 0.000877
Mean for Cestoda_. :
# A tibble: 2 × 2
region mean_Cestoda_.
<chr> <dbl>
1 Daneborg 0.128
2 Ittoqqortoormii 0.0979
Mean for Nematoda_. :
# A tibble: 2 × 2
region mean_Nematoda_.
<chr> <dbl>
1 Daneborg 0.249
2 Ittoqqortoormii 0.269
Mean for Virus_. :
# A tibble: 2 × 2
region mean_Virus_.
<chr> <dbl>
1 Daneborg 0.0731
2 Ittoqqortoormii 0.377
Mean for PARVO_Parvoviridae_. :
# A tibble: 2 × 2
region mean_PARVO_Parvoviridae_.
<chr> <dbl>
1 Daneborg 0.262
2 Ittoqqortoormii 0.405
Mean for PARVO_Bocaparvovirus_. :
# A tibble: 2 × 2
region mean_PARVO_Bocaparvovirus_.
<chr> <dbl>
1 Daneborg 0.0563
2 Ittoqqortoormii 0.392
Mean for ADENO_Adenoviridae_. :
# A tibble: 2 × 2
region mean_ADENO_Adenoviridae_.
<chr> <dbl>
1 Daneborg 0.0124
2 Ittoqqortoormii 0.0816
Mean for PAP_Papillomaviridae_. :
# A tibble: 2 × 2
region mean_PAP_Papillomaviridae_.
<chr> <dbl>
1 Daneborg 0.0262
2 Ittoqqortoormii 0.211
Standard Deviation for Cetacea_. :
# A tibble: 2 × 2
region sd_Cetacea_.
<chr> <dbl>
1 Daneborg 0.0539
2 Ittoqqortoormii 0.0499
Standard Deviation for Pinnipedia_. :
# A tibble: 2 × 2
region sd_Pinnipedia_.
<chr> <dbl>
1 Daneborg 0.0186
2 Ittoqqortoormii 0.0724
Standard Deviation for Eukaryota_. :
# A tibble: 2 × 2
region sd_Eukaryota_.
<chr> <dbl>
1 Daneborg 1.73
2 Ittoqqortoormii 2.21
Standard Deviation for Cryptosporidium_. :
# A tibble: 2 × 2
region sd_Cryptosporidium_.
<chr> <dbl>
1 Daneborg 0.00164
2 Ittoqqortoormii 0.00319
Standard Deviation for Giardia_. :
# A tibble: 2 × 2
region sd_Giardia_.
<chr> <dbl>
1 Daneborg 0.000799
2 Ittoqqortoormii 0.000526
Standard Deviation for Sarcocystis_. :
# A tibble: 2 × 2
region sd_Sarcocystis_.
<chr> <dbl>
1 Daneborg NA
2 Ittoqqortoormii 0.000186
Standard Deviation for Toxoplasma_gondii_. :
# A tibble: 2 × 2
region sd_Toxoplasma_gondii_.
<chr> <dbl>
1 Daneborg 0.00130
2 Ittoqqortoormii 0.00501
Standard Deviation for Trichinella_. :
# A tibble: 2 × 2
region sd_Trichinella_.
<chr> <dbl>
1 Daneborg 0.00283
2 Ittoqqortoormii 0.00165
Standard Deviation for Uncinaria_. :
# A tibble: 2 × 2
region sd_Uncinaria_.
<chr> <dbl>
1 Daneborg 0.00469
2 Ittoqqortoormii 0.0000726
Standard Deviation for Echinococcus_. :
# A tibble: 2 × 2
region sd_Echinococcus_.
<chr> <dbl>
1 Daneborg 0.0164
2 Ittoqqortoormii 0.00585
Standard Deviation for Taenia_. :
# A tibble: 2 × 2
region sd_Taenia_.
<chr> <dbl>
1 Daneborg 0.00214
2 Ittoqqortoormii 0.00286
Standard Deviation for Toxascaris_. :
# A tibble: 2 × 2
region sd_Toxascaris_.
<chr> <dbl>
1 Daneborg 0.00000577
2 Ittoqqortoormii 0.000941
Standard Deviation for Cestoda_. :
# A tibble: 2 × 2
region sd_Cestoda_.
<chr> <dbl>
1 Daneborg 0.150
2 Ittoqqortoormii 0.0405
Standard Deviation for Nematoda_. :
# A tibble: 2 × 2
region sd_Nematoda_.
<chr> <dbl>
1 Daneborg 0.0662
2 Ittoqqortoormii 0.0541
Standard Deviation for Virus_. :
# A tibble: 2 × 2
region sd_Virus_.
<chr> <dbl>
1 Daneborg 0.0331
2 Ittoqqortoormii 0.798
Standard Deviation for PARVO_Parvoviridae_. :
# A tibble: 2 × 2
region sd_PARVO_Parvoviridae_.
<chr> <dbl>
1 Daneborg 0.147
2 Ittoqqortoormii 1.92
Standard Deviation for PARVO_Bocaparvovirus_. :
# A tibble: 2 × 2
region sd_PARVO_Bocaparvovirus_.
<chr> <dbl>
1 Daneborg 0.0426
2 Ittoqqortoormii 1.92
Standard Deviation for ADENO_Adenoviridae_. :
# A tibble: 2 × 2
region sd_ADENO_Adenoviridae_.
<chr> <dbl>
1 Daneborg 0.00951
2 Ittoqqortoormii 0.114
Standard Deviation for PAP_Papillomaviridae_. :
# A tibble: 2 × 2
region sd_PAP_Papillomaviridae_.
<chr> <dbl>
1 Daneborg 0.0231
2 Ittoqqortoormii 0.263
Wilcoxon testing (percentages)
wilcox_kraken <- kraken_subset %>%
pivot_longer(-c(sample, region), names_to = "Parasite", values_to = "value") %>%
group_by(Parasite) %>%
summarise(p_value = {
if (n_distinct(region) >= 2) {
wilcox.test(value ~ region, data = cur_data(), exact = TRUE)$p.value
} else {
NA
}
}) %>%
mutate(p_adjust = ifelse(!is.na(p_value), p.adjust(p_value, method = "BH"), NA))
significant_kraken <- wilcox_kraken %>%
filter(!is.na(p_adjust) & p_adjust < 0.05) %>%
knitr::kable()
Fishers Exact testing (presence / absence)
Fisher's Exact Test for Count Data
data: contingency_table
p-value = 1
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
0.02564066 Inf
sample estimates:
odds ratio
Inf